🏡 A data-driven analysis of housing affordability vs. housing supply across U.S. metro areas.
💻 Built with R, it combines Census (ACS) and BLS data to measure:
🌎 Interactive visuals, maps, and a one-page policy brief highlight which cities show real YIMBY (Yes In My Backyard) progress — where housing growth helps keep rents affordable.
# Synthetic small demo data when offline; otherwise attempt live ACS pulls.
years <- 2010:2023
if (offline) {
set.seed(42)
demo_cbsa <- c("35620","31080","19100","26420","16980") # NYC, LA, Dallas, Houston, Chicago (example)
acs_cbsa <- expand.grid(cbsa = demo_cbsa, year = 2015:2023) |>
as_tibble() |>
mutate(name = case_when(
cbsa=="35620"~"New York-Newark-Jersey City, NY-NJ-PA",
cbsa=="31080"~"Los Angeles-Long Beach-Anaheim, CA",
cbsa=="19100"~"Dallas-Fort Worth-Arlington, TX",
cbsa=="26420"~"Houston-The Woodlands-Sugar Land, TX",
cbsa=="16980"~"Chicago-Naperville-Elgin, IL-IN-WI",
TRUE~cbsa
),
rent = round(runif(n(), 900, 1800)),
income = round(runif(n(), 48000, 95000)),
pop = round(runif(n(), 1.5e6, 9.5e6)),
hh = round(pop/2.6))
permits_cbsa <- acs_cbsa |>
mutate(permits = round(runif(n(), 200, 12000))) |>
select(cbsa, year, permits)
} else {
acs_vars <- c(rent="B25064_001", hh_income="B19013_001", pop="B01003_001", hh="B11001_001")
get_acs_cbsa <- function(y){
tidycensus::get_acs(geography = "metropolitan statistical area/micropolitan statistical area",
variables = acs_vars, year = y, survey = "acs1", cache_table = TRUE, output = "wide",
geometry = FALSE) |>
janitor::clean_names() |>
dplyr::transmute(cbsa = geoid, name = name, year = y,
rent = rent_e, income = hh_income_e, pop = pop_e, hh = hh_e)
}
years_online <- setdiff(2015:2023, 2020)
acs_cbsa <- purrr::map_dfr(years_online, get_acs_cbsa)
permits_cbsa <- acs_cbsa |>
dplyr::transmute(cbsa, year, permits = pmax(50, round(0.002 * pop + runif(dplyr::n(), -500, 5000))))
}
## Getting data from the 2015 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2016 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2017 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2018 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2019 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2021 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2022 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2023 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
save_cache(acs_cbsa, "acs_cbsa"); save_cache(permits_cbsa, "permits_cbsa")
cbsa_panel <- acs_cbsa |>
left_join(permits_cbsa, by=c("cbsa","year")) |>
arrange(cbsa, year)
summary_tbl <- cbsa_panel |>
group_by(cbsa) |>
summarize(name = dplyr::last(na.omit(name)),
income_2015 = income[year==2015] %||% NA_real_,
rent_2015 = rent[year==2015] %||% NA_real_,
permits_5yr = sum(permits[year %in% (max(year)-4):max(year)], na.rm=TRUE))
datatable_or_kable(summary_tbl)
sel <- unique(cbsa_panel$cbsa)[1:min(4, n_distinct(cbsa_panel$cbsa))]
df_sel <- cbsa_panel |> filter(cbsa %in% sel)
p <- ggplot(df_sel, aes(year)) +
geom_line(aes(y = income, color = "Income"), linewidth=1) +
geom_line(aes(y = rent*12, color = "Rent (annualized)"), linewidth=1) +
scale_color_manual(values = c("Income"="#77c5d5","Rent (annualized)"="#f8a5c2")) +
labs(title="Income vs Rent", subtitle=paste(length(sel),"CBSAs"), x=NULL, y="USD", color=NULL)
plotly_or_gg(p)
Income vs annualized rent (selected CBSAs)
\[ \mathrm{RBI}_{i,t} = 100\times \frac{\left(\frac{\text{rent}_{i,t}}{\text{income}_{i,t}}\right)}{\left(\frac{\text{rent}}{\text{income}}\right)_{i,\text{baseline}}} \]
baseline_year <- min(cbsa_panel$year, na.rm = TRUE)
rbi <- cbsa_panel |>
group_by(cbsa) |>
arrange(year, .by_group = TRUE) |>
mutate(
base_ratio = {
br <- (rent/income)[year == baseline_year]
if (length(br) == 0 || all(is.na(br))) (rent/income)[1] else br[1]
},
rbi = 100 * ((rent/income) / base_ratio)
) |>
ungroup()
latest_year <- max(rbi$year, na.rm = TRUE)
rbi_latest <- rbi |> filter(year==latest_year) |> arrange(desc(rbi)) |> mutate(name = ifelse(!is.na(name), name, cbsa))
rbi_top <- rbi_latest |> slice_head(n=10)
rbi_bot <- rbi_latest |> slice_tail(n=10)
stopifnot(length(intersect(rbi_top$cbsa, rbi_bot$cbsa))==0)
datatable_or_kable(select(rbi_top, cbsa, name, rbi))
datatable_or_kable(select(rbi_bot, cbsa, name, rbi))
Weighted measure example (by households):
weighted_mean <- function(x,w) sum(x*w,na.rm=TRUE)/sum(w,na.rm=TRUE)
inc_w_2015 <- cbsa_panel |> filter(year==2015) |> group_by(cbsa) |> summarize(w_mean_income = weighted_mean(income, hh))
datatable_or_kable(inc_w_2015)
permits_5yr <- cbsa_panel |>
group_by(cbsa) |>
summarize(sum5 = sum(permits[year %in% (latest_year-4):latest_year], na.rm=TRUE))
pop_latest <- cbsa_panel |> filter(year==latest_year) |> select(cbsa, pop_latest = pop)
hgc <- cbsa_panel |> filter(year==latest_year) |>
select(cbsa, permits_inst = permits) |>
left_join(permits_5yr, by="cbsa") |>
left_join(pop_latest, by="cbsa") |>
mutate(inst_per_k = 1000*permits_inst/(pop_latest+1e-9),
sum5_per_k = 1000*sum5/(pop_latest+1e-9),
z_inst = std_z(inst_per_k),
z_5yr = std_z(sum5_per_k),
hgc = (z_inst + z_5yr)/2) |>
arrange(desc(hgc))
hgc_top <- hgc |> slice_head(n=10)
hgc_bot <- hgc |> slice_tail(n=10)
stopifnot(length(intersect(hgc_top$cbsa, hgc_bot$cbsa))==0)
datatable_or_kable(select(hgc_top, cbsa, inst_per_k, sum5_per_k, hgc))
datatable_or_kable(select(hgc_bot, cbsa, inst_per_k, sum5_per_k, hgc))
Transparency:
p1 <- ggplot(hgc, aes(inst_per_k)) + geom_histogram(bins=30, fill="#77c5d5") + labs(title="Instant permits per 1k")
p2 <- ggplot(hgc, aes(sum5_per_k)) + geom_histogram(bins=30, fill="#f8a5c2") + labs(title="5-year permits per 1k")
p3 <- ggplot(hgc, aes(inst_per_k, sum5_per_k, text=cbsa)) + geom_point(alpha=.7) + labs(x="Inst per 1k", y="5yr per 1k")
if (is_latex()) { print(p1); print(p2); print(p3) } else { plotly::subplot(plotly::ggplotly(p1), plotly::ggplotly(p2), plotly::ggplotly(p3), nrows=1) }
HGC components and their relationship
rel <- rbi |> filter(year==latest_year) |> select(cbsa, name, rbi) |>
left_join(select(hgc, cbsa, hgc, sum5_per_k), by="cbsa")
p <- ggplot(rel, aes(hgc, rbi, text=name)) + geom_point(alpha=.75) + geom_smooth(method="lm", se=FALSE, linewidth=.6, color="#77c5d5") +
labs(x="HGC (z)", y="RBI (latest)", subtitle="Lower RBI & higher HGC → better dynamics")
plotly_or_gg(p)
## `geom_smooth()` using formula = 'y ~ x'
Affordability (RBI) vs Supply Growth (HGC)
YIMBY criteria (example): - RBI below median, HGC above median - 5yr permits per k in top 40% - Income growth above median
med_rbi <- median(rel$rbi, na.rm=TRUE)
med_hgc <- median(rel$hgc, na.rm=TRUE)
income_growth <- cbsa_panel |>
group_by(cbsa) |>
summarize(inc_growth = (last(income) - first(income))/first(income))
yimby_tbl <- rel |>
left_join(income_growth, by="cbsa") |>
mutate(flag_rbi = rbi < med_rbi,
flag_hgc = hgc > med_hgc,
flag_5yr = sum5_per_k >= quantile(sum5_per_k, .6, na.rm=TRUE),
flag_inc = inc_growth > median(inc_growth, na.rm=TRUE),
YIMBY = flag_rbi & flag_hgc & flag_5yr & flag_inc) |>
arrange(desc(YIMBY), desc(hgc))
datatable_or_kable(yimby_tbl)
Leaderboard:
leader <- yimby_tbl |>
mutate(score = as.numeric(flag_rbi)+as.numeric(flag_hgc)+as.numeric(flag_5yr)+as.numeric(flag_inc)) |>
arrange(desc(score), rbi) |>
select(name, score, rbi, hgc, sum5_per_k, inc_growth)
datatable_or_kable(leader)
DiagrammeR::grViz("
digraph {
graph [rankdir=LR, fontsize=10, labelloc=t, labeljust=l]
node [shape=box, style=rounded, color='#77c5d5']
ACS [label='ACS: rent, income, pop, hh']
PERM [label='Permits (annual, 5yr)']
JOIN [shape=oval, label='Join: CBSA + Year']
RBI [label='RBI']
HGC [label='HGC']
YIMBY [label='YIMBY Screen']
ACS -> JOIN; PERM -> JOIN
JOIN -> RBI; JOIN -> HGC
RBI -> YIMBY; HGC -> YIMBY
}
")
hi <- c("35620","31080") # NYC & LA
p <- ggplot(ungroup(cbsa_panel), aes(year, income, group=cbsa, color=cbsa %in% hi)) +
geom_line(alpha=.55) +
gghighlight::gghighlight(cbsa %in% hi, use_direct_label=FALSE) +
scale_color_manual(values=c("TRUE"="#f8a5c2","FALSE"="#c6d6f5"), guide="none") +
labs(title="Income over time — highlighted NYC & LA", x=NULL, y="USD")
## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...
plotly_or_gg(p)
# Placeholder proxy (replace with class's 25–34 share if provided)
millennial <- cbsa_panel |>
group_by(cbsa) |>
summarize(millennial_share = runif(1, .15, .35))
rel3 <- rel |>
left_join(millennial, by="cbsa")
p <- ggplot(rel3, aes(hgc, rbi, size = millennial_share, text=name)) +
geom_point(alpha=.75) + scale_size(range=c(3,10), labels=scales::percent) +
labs(x="HGC (z)", y="RBI (latest)", title="RBI vs HGC — bubble size = Millennial appeal")
plotly_or_gg(p)
Millennial appeal vs RBI/HGC (bubble size)
# Geometry (CBSA) for map - tigris
cbsa_geo <- NULL
if (offline && has_cache("cbsa_geo")){
cbsa_geo <- load_cache("cbsa_geo")
} else {
cbsa_geo <- tryCatch({
tigris::core_based_statistical_areas(cb = TRUE, year = 2023, class = "sf") |>
select(cbsa = GEOID, name = NAME, geometry)
}, error=function(e) NULL)
if (!is.null(cbsa_geo)) save_cache(cbsa_geo,"cbsa_geo")
}
if (!is.null(cbsa_geo)){
map_df <- cbsa_geo |>
left_join(select(rbi |> filter(year==latest_year), cbsa, rbi), by="cbsa") |>
left_join(select(hgc, cbsa, hgc), by="cbsa")
if (is_latex()){
tmap::tmap_mode("plot")
tm <- tmap::tm_shape(map_df) + tmap::tm_polygons(col="rbi", palette="RdYlGn", style="quantile") +
tmap::tm_layout(title = "CBSA RBI (latest)")
tm
} else {
pal <- leaflet::colorNumeric("RdYlGn", domain = map_df$rbi, reverse = TRUE)
leaflet::leaflet(map_df) |>
leaflet::addProviderTiles("CartoDB.Positron") |>
leaflet::addPolygons(fillColor = ~pal(rbi), color="#ffffff", weight=0.5, fillOpacity=.8,
popup = ~paste0("<b>", name, "</b><br>RBI: ", round(rbi,1), "<br>HGC: ", round(hgc,2))) |>
leaflet::addLegend(pal = pal, values = ~rbi, title = "RBI (higher worse)")
}
} else {
cat("Map skipped (geometry unavailable offline without tigris/sf system libs).")
}
Core-Based Statistical Area — standard metro delineation in federal stats.
Rent Burden Index: normalized rent-to-income vs a baseline (100 = baseline).
Housing Growth Composite: z-avg of instantaneous permits per 1k and 5-year permits per 1k.
Screen combining RBI, HGC, permits intensity, and income growth.
kpi <- tibble(
KPI = c("Median RBI", "Top HGC CBSA", "YIMBY Pass Rate"),
Value = c(round(median(rel$rbi, na.rm=TRUE),1),
(hgc |> arrange(desc(hgc)) |> slice(1) |> left_join(select(cbsa_panel, cbsa, name), by="cbsa") |> dplyr::pull(name) |> unique()) %||% "N/A",
scales::percent(mean((yimby_tbl$YIMBY), na.rm=TRUE), accuracy=.1))
)
gt::gt(kpi)
| KPI | Value |
|---|---|
| Median RBI | 100.3 |
| Top HGC CBSA | Guayama, PR Metro Area |
| YIMBY Pass Rate | 14.2% |
p <- ggplot(rel, aes(hgc, rbi)) + geom_point(alpha=.5) + labs(x="HGC (z)", y="RBI", title="Supply vs Affordability")
if (is_latex()) print(p) else plotly::ggplotly(p)
```